{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "uisVbKLZ16pl"
   },
   "source": [
    "# Train Word2Vec model\n",
    "This notebook aims to prepare a word-vector file for later use."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple\n",
      "Requirement already satisfied: jieba in /opt/conda/lib/python3.8/site-packages (0.42.1)\n",
      "Requirement already satisfied: tqdm in /opt/conda/lib/python3.8/site-packages (4.56.0)\n",
      "Requirement already satisfied: gensim in /opt/conda/lib/python3.8/site-packages (4.1.2)\n",
      "Requirement already satisfied: numpy>=1.17.0 in /opt/conda/lib/python3.8/site-packages (from gensim) (1.19.5)\n",
      "Requirement already satisfied: scipy>=0.18.1 in /opt/conda/lib/python3.8/site-packages (from gensim) (1.5.3)\n",
      "Requirement already satisfied: smart-open>=1.8.1 in /opt/conda/lib/python3.8/site-packages (from gensim) (5.2.1)\n"
     ]
    }
   ],
   "source": [
    "# Use %pip (not !pip) so the packages are installed into the running kernel's\n",
    "# environment; versions are pinned to the ones this notebook was executed with.\n",
    "%pip install jieba==0.42.1 tqdm==4.56.0 gensim==4.1.2 -i https://pypi.tuna.tsinghua.edu.cn/simple"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 55
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 974,
     "status": "ok",
     "timestamp": 1562676842917,
     "user": {
      "displayName": "F74051297王彥霖",
      "photoUrl": "",
      "userId": "11026930765053722807"
     },
     "user_tz": -480
    },
    "id": "6iDJahVq1z2C",
    "outputId": "d5371440-934e-4ea6-d8c1-3bdd78bb95d2"
   },
   "outputs": [],
   "source": [
    "import jieba\n",
    "import logging\n",
    "import pandas as pd\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/home/jovyan/work/pytorch/mta-lstm-pytorch/data\n"
     ]
    }
   ],
   "source": [
    "!pwd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "WHkZbawTe3Tf"
   },
   "outputs": [],
   "source": [
    "file_path = ''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 72
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 5995,
     "status": "ok",
     "timestamp": 1562676851453,
     "user": {
      "displayName": "F74051297王彥霖",
      "photoUrl": "",
      "userId": "11026930765053722807"
     },
     "user_tz": -480
    },
    "id": "yCVetXmOI4Z3",
    "outputId": "2963bc93-fc6f-425b-99c6-de85f25c36a2"
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 62%|██████▏   | 305001/494944 [00:01<00:00, 211940.64it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "extract 305001 articles\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# Strip the topic labels from every '<article> </d> <topics>' line of the raw\n",
    "# corpus and write the bare article text, one article per line.\n",
    "max_articles = 305001  # stop after this many articles (line indices 0..305000)\n",
    "src_path = file_path + 'composition.txt'\n",
    "dst_path = file_path + 'composition_seg.txt'\n",
    "\n",
    "# The line count is only needed to size the progress bar; `with` closes the handle.\n",
    "with open(src_path, 'r', encoding='utf-8') as f:\n",
    "    num_lines = sum(1 for _ in f)\n",
    "\n",
    "num_written = 0\n",
    "# Both files are context-managed so they are closed even if a malformed line raises.\n",
    "# The input is read as UTF-8 explicitly, matching the encoding used for the output.\n",
    "with open(dst_path, 'w', encoding='utf-8') as output:\n",
    "    with open(src_path, 'r', encoding='utf-8') as f:\n",
    "        # Cap the bar total at the cut-off so it reaches 100% when we stop early.\n",
    "        for idx, line in tqdm(enumerate(f), total=min(num_lines, max_articles)):\n",
    "            if idx >= max_articles:\n",
    "                break\n",
    "            fields = line.strip('\\n').split(' </d> ')\n",
    "            if len(fields) != 2:\n",
    "                # Same exception type as a failed tuple unpack, but it names the line.\n",
    "                raise ValueError('line %d: expected exactly one </d> separator, found %d'\n",
    "                                 % (idx + 1, len(fields) - 1))\n",
    "            article, topics = fields\n",
    "            output.write(article + ' \\n')\n",
    "            num_written += 1\n",
    "\n",
    "# Printed after the loop so the count is reported whether or not the cut-off was hit.\n",
    "print('\\nextract %d articles' % num_written)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "kqkTZZB2J6iW"
   },
   "outputs": [],
   "source": [
    "word2vec_params = {\n",
    "    'sg': 1,\n",
    "    \"vector_size\": 100,\n",
    "    \"alpha\": 0.01,\n",
    "    \"min_alpha\": 0.0005,\n",
    "    'window': 10,\n",
    "    'min_count': 1,\n",
    "    'seed': 1,\n",
    "    \"workers\": 24,\n",
    "    \"negative\": 0,\n",
    "    \"hs\": 1,\n",
    "    'compute_loss': True,\n",
    "    'epochs': 1,\n",
    "    'cbow_mean': 0,\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 1000
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 536177,
     "status": "ok",
     "timestamp": 1562683084645,
     "user": {
      "displayName": "F74051297王彥霖",
      "photoUrl": "",
      "userId": "11026930765053722807"
     },
     "user_tz": -480
    },
    "id": "JORzjKO9J9az",
    "outputId": "7578c451-ad64-4e94-9b40-e083a15eee36"
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2022-04-21 13:19:28,466 : INFO : collecting all words and their counts\n",
      "2022-04-21 13:19:28,471 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types\n",
      "2022-04-21 13:19:28,683 : INFO : PROGRESS: at sentence #10000, processed 639038 words, keeping 37582 word types\n",
      "2022-04-21 13:19:28,906 : INFO : PROGRESS: at sentence #20000, processed 1281549 words, keeping 52142 word types\n",
      "2022-04-21 13:19:29,135 : INFO : PROGRESS: at sentence #30000, processed 1928922 words, keeping 63195 word types\n",
      "2022-04-21 13:19:29,354 : INFO : PROGRESS: at sentence #40000, processed 2574680 words, keeping 72472 word types\n",
      "2022-04-21 13:19:29,621 : INFO : PROGRESS: at sentence #50000, processed 3220850 words, keeping 81135 word types\n",
      "2022-04-21 13:19:29,864 : INFO : PROGRESS: at sentence #60000, processed 3869073 words, keeping 88856 word types\n",
      "2022-04-21 13:19:30,094 : INFO : PROGRESS: at sentence #70000, processed 4516620 words, keeping 95385 word types\n",
      "2022-04-21 13:19:30,318 : INFO : PROGRESS: at sentence #80000, processed 5159635 words, keeping 102036 word types\n",
      "2022-04-21 13:19:30,568 : INFO : PROGRESS: at sentence #90000, processed 5796725 words, keeping 107290 word types\n",
      "2022-04-21 13:19:30,775 : INFO : PROGRESS: at sentence #100000, processed 6427171 words, keeping 112169 word types\n",
      "2022-04-21 13:19:30,995 : INFO : PROGRESS: at sentence #110000, processed 7064307 words, keeping 116685 word types\n",
      "2022-04-21 13:19:31,222 : INFO : PROGRESS: at sentence #120000, processed 7698505 words, keeping 121729 word types\n",
      "2022-04-21 13:19:31,482 : INFO : PROGRESS: at sentence #130000, processed 8338144 words, keeping 125960 word types\n",
      "2022-04-21 13:19:31,705 : INFO : PROGRESS: at sentence #140000, processed 8976955 words, keeping 130319 word types\n",
      "2022-04-21 13:19:31,922 : INFO : PROGRESS: at sentence #150000, processed 9616685 words, keeping 135046 word types\n",
      "2022-04-21 13:19:32,139 : INFO : PROGRESS: at sentence #160000, processed 10255364 words, keeping 139068 word types\n",
      "2022-04-21 13:19:32,350 : INFO : PROGRESS: at sentence #170000, processed 10896337 words, keeping 142792 word types\n",
      "2022-04-21 13:19:32,568 : INFO : PROGRESS: at sentence #180000, processed 11536279 words, keeping 146758 word types\n",
      "2022-04-21 13:19:32,784 : INFO : PROGRESS: at sentence #190000, processed 12189324 words, keeping 151867 word types\n",
      "2022-04-21 13:19:32,988 : INFO : PROGRESS: at sentence #200000, processed 12834415 words, keeping 155719 word types\n",
      "2022-04-21 13:19:33,191 : INFO : PROGRESS: at sentence #210000, processed 13473888 words, keeping 159418 word types\n",
      "2022-04-21 13:19:33,401 : INFO : PROGRESS: at sentence #220000, processed 14117571 words, keeping 162905 word types\n",
      "2022-04-21 13:19:33,615 : INFO : PROGRESS: at sentence #230000, processed 14762635 words, keeping 166119 word types\n",
      "2022-04-21 13:19:33,830 : INFO : PROGRESS: at sentence #240000, processed 15402140 words, keeping 169693 word types\n",
      "2022-04-21 13:19:34,037 : INFO : PROGRESS: at sentence #250000, processed 16043245 words, keeping 172970 word types\n",
      "2022-04-21 13:19:34,253 : INFO : PROGRESS: at sentence #260000, processed 16681480 words, keeping 175845 word types\n",
      "2022-04-21 13:19:34,471 : INFO : PROGRESS: at sentence #270000, processed 17318486 words, keeping 179118 word types\n",
      "2022-04-21 13:19:34,682 : INFO : PROGRESS: at sentence #280000, processed 17953879 words, keeping 181964 word types\n",
      "2022-04-21 13:19:34,914 : INFO : PROGRESS: at sentence #290000, processed 18598868 words, keeping 185246 word types\n",
      "2022-04-21 13:19:35,123 : INFO : PROGRESS: at sentence #300000, processed 19240104 words, keeping 187837 word types\n",
      "2022-04-21 13:19:35,234 : INFO : collected 189609 word types from a corpus of 19563557 raw words and 305001 sentences\n",
      "2022-04-21 13:19:35,238 : INFO : Creating a fresh vocabulary\n",
      "2022-04-21 13:19:36,064 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 189609 unique words (100.0%% of original 189609, drops 0)', 'datetime': '2022-04-21T13:19:35.974938', 'gensim': '4.1.2', 'python': '3.8.6 | packaged by conda-forge | (default, Dec 26 2020, 05:05:16) \\n[GCC 9.3.0]', 'platform': 'Linux-4.19.0-10-amd64-x86_64-with-glibc2.10', 'event': 'prepare_vocab'}\n",
      "2022-04-21 13:19:36,068 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 19563557 word corpus (100.0%% of original 19563557, drops 0)', 'datetime': '2022-04-21T13:19:36.068148', 'gensim': '4.1.2', 'python': '3.8.6 | packaged by conda-forge | (default, Dec 26 2020, 05:05:16) \\n[GCC 9.3.0]', 'platform': 'Linux-4.19.0-10-amd64-x86_64-with-glibc2.10', 'event': 'prepare_vocab'}\n",
      "2022-04-21 13:19:37,242 : INFO : deleting the raw counts dictionary of 189609 items\n",
      "2022-04-21 13:19:37,249 : INFO : sample=0.001 downsamples 40 most-common words\n",
      "2022-04-21 13:19:37,252 : INFO : Word2Vec lifecycle event {'msg': 'downsampling leaves estimated 14346949.229601195 word corpus (73.3%% of prior 19563557)', 'datetime': '2022-04-21T13:19:37.252877', 'gensim': '4.1.2', 'python': '3.8.6 | packaged by conda-forge | (default, Dec 26 2020, 05:05:16) \\n[GCC 9.3.0]', 'platform': 'Linux-4.19.0-10-amd64-x86_64-with-glibc2.10', 'event': 'prepare_vocab'}\n",
      "2022-04-21 13:19:37,409 : INFO : constructing a huffman tree from 189609 words\n",
      "2022-04-21 13:19:45,933 : INFO : built huffman tree with maximum node depth 24\n",
      "2022-04-21 13:19:46,057 : INFO : estimated required memory for 189609 words and 100 dimensions: 284413500 bytes\n",
      "2022-04-21 13:19:46,062 : INFO : resetting layer weights\n",
      "2022-04-21 13:19:46,306 : INFO : Word2Vec lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2022-04-21T13:19:46.306257', 'gensim': '4.1.2', 'python': '3.8.6 | packaged by conda-forge | (default, Dec 26 2020, 05:05:16) \\n[GCC 9.3.0]', 'platform': 'Linux-4.19.0-10-amd64-x86_64-with-glibc2.10', 'event': 'build_vocab'}\n",
      "2022-04-21 13:19:46,310 : INFO : Word2Vec lifecycle event {'msg': 'training model with 24 workers on 189609 vocabulary and 100 features, using sg=1 hs=1 sample=0.001 negative=0 window=10 shrink_windows=True', 'datetime': '2022-04-21T13:19:46.310343', 'gensim': '4.1.2', 'python': '3.8.6 | packaged by conda-forge | (default, Dec 26 2020, 05:05:16) \\n[GCC 9.3.0]', 'platform': 'Linux-4.19.0-10-amd64-x86_64-with-glibc2.10', 'event': 'train'}\n",
      "2022-04-21 13:19:47,955 : INFO : EPOCH 1 - PROGRESS: at 0.05% examples, 4443 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:19:48,975 : INFO : EPOCH 1 - PROGRESS: at 0.46% examples, 24759 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:19:49,975 : INFO : EPOCH 1 - PROGRESS: at 1.07% examples, 42020 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:19:51,105 : INFO : EPOCH 1 - PROGRESS: at 1.38% examples, 41277 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:19:52,108 : INFO : EPOCH 1 - PROGRESS: at 1.89% examples, 46698 words/s, in_qsize 0, out_qsize 1\n",
      "2022-04-21 13:19:53,190 : INFO : EPOCH 1 - PROGRESS: at 2.34% examples, 48848 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:19:54,212 : INFO : EPOCH 1 - PROGRESS: at 2.81% examples, 50869 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:19:55,453 : INFO : EPOCH 1 - PROGRESS: at 3.28% examples, 51143 words/s, in_qsize 0, out_qsize 1\n",
      "2022-04-21 13:19:56,675 : INFO : EPOCH 1 - PROGRESS: at 3.84% examples, 52855 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:19:57,842 : INFO : EPOCH 1 - PROGRESS: at 4.19% examples, 51917 words/s, in_qsize 0, out_qsize 2\n",
      "2022-04-21 13:19:58,943 : INFO : EPOCH 1 - PROGRESS: at 4.75% examples, 53755 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:00,072 : INFO : EPOCH 1 - PROGRESS: at 5.31% examples, 55200 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:01,181 : INFO : EPOCH 1 - PROGRESS: at 5.82% examples, 55945 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:02,239 : INFO : EPOCH 1 - PROGRESS: at 6.22% examples, 55867 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:03,390 : INFO : EPOCH 1 - PROGRESS: at 6.73% examples, 56382 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:04,460 : INFO : EPOCH 1 - PROGRESS: at 7.14% examples, 56279 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:05,509 : INFO : EPOCH 1 - PROGRESS: at 7.50% examples, 55874 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:06,691 : INFO : EPOCH 1 - PROGRESS: at 8.00% examples, 56224 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:07,835 : INFO : EPOCH 1 - PROGRESS: at 8.41% examples, 55947 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:08,946 : INFO : EPOCH 1 - PROGRESS: at 8.91% examples, 56406 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:10,145 : INFO : EPOCH 1 - PROGRESS: at 9.41% examples, 56631 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:11,176 : INFO : EPOCH 1 - PROGRESS: at 9.81% examples, 56635 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:12,191 : INFO : EPOCH 1 - PROGRESS: at 10.31% examples, 57239 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:13,263 : INFO : EPOCH 1 - PROGRESS: at 10.77% examples, 57385 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:14,445 : INFO : EPOCH 1 - PROGRESS: at 11.23% examples, 57315 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:15,537 : INFO : EPOCH 1 - PROGRESS: at 11.73% examples, 57659 words/s, in_qsize 1, out_qsize 0\n",
      "2022-04-21 13:20:16,639 : INFO : EPOCH 1 - PROGRESS: at 12.08% examples, 57239 words/s, in_qsize 0, out_qsize 2\n",
      "2022-04-21 13:20:17,654 : INFO : EPOCH 1 - PROGRESS: at 12.59% examples, 57723 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:18,736 : INFO : EPOCH 1 - PROGRESS: at 13.05% examples, 57817 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:19,890 : INFO : EPOCH 1 - PROGRESS: at 13.60% examples, 58202 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:21,232 : INFO : EPOCH 1 - PROGRESS: at 13.95% examples, 57424 words/s, in_qsize 0, out_qsize 3\n",
      "2022-04-21 13:20:22,368 : INFO : EPOCH 1 - PROGRESS: at 14.61% examples, 58244 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:23,406 : INFO : EPOCH 1 - PROGRESS: at 14.92% examples, 57790 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:24,465 : INFO : EPOCH 1 - PROGRESS: at 15.53% examples, 58478 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:25,503 : INFO : EPOCH 1 - PROGRESS: at 15.98% examples, 58601 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:26,552 : INFO : EPOCH 1 - PROGRESS: at 16.44% examples, 58714 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:27,613 : INFO : EPOCH 1 - PROGRESS: at 16.84% examples, 58624 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:28,719 : INFO : EPOCH 1 - PROGRESS: at 17.34% examples, 58835 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:29,765 : INFO : EPOCH 1 - PROGRESS: at 17.80% examples, 58918 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:30,890 : INFO : EPOCH 1 - PROGRESS: at 18.25% examples, 58907 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:32,028 : INFO : EPOCH 1 - PROGRESS: at 18.55% examples, 58403 words/s, in_qsize 0, out_qsize 4\n",
      "2022-04-21 13:20:33,029 : INFO : EPOCH 1 - PROGRESS: at 19.17% examples, 59022 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:34,180 : INFO : EPOCH 1 - PROGRESS: at 19.57% examples, 58829 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:35,213 : INFO : EPOCH 1 - PROGRESS: at 19.96% examples, 58795 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:36,236 : INFO : EPOCH 1 - PROGRESS: at 20.41% examples, 58907 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:37,374 : INFO : EPOCH 1 - PROGRESS: at 20.82% examples, 58740 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:38,390 : INFO : EPOCH 1 - PROGRESS: at 21.18% examples, 58580 words/s, in_qsize 0, out_qsize 2\n",
      "2022-04-21 13:20:39,462 : INFO : EPOCH 1 - PROGRESS: at 21.68% examples, 58771 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:40,658 : INFO : EPOCH 1 - PROGRESS: at 22.13% examples, 58687 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:41,661 : INFO : EPOCH 1 - PROGRESS: at 22.63% examples, 58960 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:42,992 : INFO : EPOCH 1 - PROGRESS: at 23.09% examples, 58735 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:44,199 : INFO : EPOCH 1 - PROGRESS: at 23.54% examples, 58653 words/s, in_qsize 0, out_qsize 2\n",
      "2022-04-21 13:20:45,320 : INFO : EPOCH 1 - PROGRESS: at 24.05% examples, 58785 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:46,326 : INFO : EPOCH 1 - PROGRESS: at 24.45% examples, 58767 words/s, in_qsize 0, out_qsize 2\n",
      "2022-04-21 13:20:47,334 : INFO : EPOCH 1 - PROGRESS: at 24.96% examples, 59004 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:48,414 : INFO : EPOCH 1 - PROGRESS: at 25.41% examples, 59032 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:49,532 : INFO : EPOCH 1 - PROGRESS: at 25.88% examples, 59021 words/s, in_qsize 0, out_qsize 1\n",
      "2022-04-21 13:20:50,709 : INFO : EPOCH 1 - PROGRESS: at 26.44% examples, 59185 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:51,882 : INFO : EPOCH 1 - PROGRESS: at 27.01% examples, 59341 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:53,056 : INFO : EPOCH 1 - PROGRESS: at 27.48% examples, 59266 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:54,077 : INFO : EPOCH 1 - PROGRESS: at 27.84% examples, 59123 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:55,114 : INFO : EPOCH 1 - PROGRESS: at 28.31% examples, 59192 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:56,167 : INFO : EPOCH 1 - PROGRESS: at 28.82% examples, 59345 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:57,193 : INFO : EPOCH 1 - PROGRESS: at 29.37% examples, 59604 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:58,300 : INFO : EPOCH 1 - PROGRESS: at 29.83% examples, 59586 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:20:59,364 : INFO : EPOCH 1 - PROGRESS: at 30.40% examples, 59804 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:21:00,511 : INFO : EPOCH 1 - PROGRESS: at 30.81% examples, 59660 words/s, in_qsize 0, out_qsize 3\n",
      "2022-04-21 13:21:01,601 : INFO : EPOCH 1 - PROGRESS: at 31.33% examples, 59761 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:21:02,616 : INFO : EPOCH 1 - PROGRESS: at 31.81% examples, 59823 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:21:03,925 : INFO : EPOCH 1 - PROGRESS: at 32.32% examples, 59763 words/s, in_qsize 0, out_qsize 2\n",
      "2022-04-21 13:21:05,047 : INFO : EPOCH 1 - PROGRESS: at 32.89% examples, 59936 words/s, in_qsize 0, out_qsize 1\n",
      "2022-04-21 13:21:06,077 : INFO : EPOCH 1 - PROGRESS: at 33.35% examples, 59982 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:21:07,103 : INFO : EPOCH 1 - PROGRESS: at 33.87% examples, 60122 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:21:08,127 : INFO : EPOCH 1 - PROGRESS: at 34.33% examples, 60164 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:21:09,202 : INFO : EPOCH 1 - PROGRESS: at 34.80% examples, 60166 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:21:10,267 : INFO : EPOCH 1 - PROGRESS: at 35.26% examples, 60180 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:21:11,353 : INFO : EPOCH 1 - PROGRESS: at 35.77% examples, 60271 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:21:12,487 : INFO : EPOCH 1 - PROGRESS: at 36.23% examples, 60233 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:21:13,500 : INFO : EPOCH 1 - PROGRESS: at 36.64% examples, 60202 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:21:14,555 : INFO : EPOCH 1 - PROGRESS: at 37.06% examples, 60145 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:21:15,645 : INFO : EPOCH 1 - PROGRESS: at 37.42% examples, 59986 words/s, in_qsize 0, out_qsize 1\n",
      "2022-04-21 13:21:16,696 : INFO : EPOCH 1 - PROGRESS: at 38.03% examples, 60280 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:21:17,792 : INFO : EPOCH 1 - PROGRESS: at 38.40% examples, 60124 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:21:19,107 : INFO : EPOCH 1 - PROGRESS: at 38.97% examples, 60148 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:21:20,195 : INFO : EPOCH 1 - PROGRESS: at 39.48% examples, 60228 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:21:21,432 : INFO : EPOCH 1 - PROGRESS: at 39.94% examples, 60135 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:21:22,525 : INFO : EPOCH 1 - PROGRESS: at 40.55% examples, 60365 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:21:23,602 : INFO : EPOCH 1 - PROGRESS: at 40.96% examples, 60301 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:21:24,854 : INFO : EPOCH 1 - PROGRESS: at 41.42% examples, 60197 words/s, in_qsize 0, out_qsize 5\n",
      "2022-04-21 13:21:26,088 : INFO : EPOCH 1 - PROGRESS: at 42.18% examples, 60538 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:21:27,147 : INFO : EPOCH 1 - PROGRESS: at 42.75% examples, 60688 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:21:28,252 : INFO : EPOCH 1 - PROGRESS: at 43.20% examples, 60668 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:21:29,538 : INFO : EPOCH 1 - PROGRESS: at 43.67% examples, 60539 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:21:30,757 : INFO : EPOCH 1 - PROGRESS: at 44.23% examples, 60592 words/s, in_qsize 0, out_qsize 1\n",
      "2022-04-21 13:21:31,760 : INFO : EPOCH 1 - PROGRESS: at 44.70% examples, 60636 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:21:32,988 : INFO : EPOCH 1 - PROGRESS: at 45.21% examples, 60620 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:21:34,048 : INFO : EPOCH 1 - PROGRESS: at 45.62% examples, 60563 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:21:35,193 : INFO : EPOCH 1 - PROGRESS: at 46.07% examples, 60541 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:21:36,276 : INFO : EPOCH 1 - PROGRESS: at 46.52% examples, 60556 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:21:37,332 : INFO : EPOCH 1 - PROGRESS: at 46.97% examples, 60585 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:21:38,423 : INFO : EPOCH 1 - PROGRESS: at 47.37% examples, 60523 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:21:39,465 : INFO : EPOCH 1 - PROGRESS: at 47.85% examples, 60551 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:21:40,469 : INFO : EPOCH 1 - PROGRESS: at 48.32% examples, 60597 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:21:41,579 : INFO : EPOCH 1 - PROGRESS: at 48.73% examples, 60522 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:21:42,835 : INFO : EPOCH 1 - PROGRESS: at 49.24% examples, 60501 words/s, in_qsize 0, out_qsize 1\n",
      "2022-04-21 13:21:43,854 : INFO : EPOCH 1 - PROGRESS: at 49.74% examples, 60594 words/s, in_qsize 0, out_qsize 1\n",
      "2022-04-21 13:21:44,938 : INFO : EPOCH 1 - PROGRESS: at 50.20% examples, 60598 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:21:46,044 : INFO : EPOCH 1 - PROGRESS: at 50.51% examples, 60409 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:21:47,095 : INFO : EPOCH 1 - PROGRESS: at 51.02% examples, 60490 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:21:48,384 : INFO : EPOCH 1 - PROGRESS: at 51.64% examples, 60565 words/s, in_qsize 0, out_qsize 1\n",
      "2022-04-21 13:21:49,462 : INFO : EPOCH 1 - PROGRESS: at 52.10% examples, 60569 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:21:50,567 : INFO : EPOCH 1 - PROGRESS: at 52.56% examples, 60562 words/s, in_qsize 0, out_qsize 1\n",
      "2022-04-21 13:21:51,699 : INFO : EPOCH 1 - PROGRESS: at 53.02% examples, 60540 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:21:52,776 : INFO : EPOCH 1 - PROGRESS: at 53.48% examples, 60546 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:21:53,861 : INFO : EPOCH 1 - PROGRESS: at 53.99% examples, 60600 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:21:54,906 : INFO : EPOCH 1 - PROGRESS: at 54.45% examples, 60616 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:21:56,007 : INFO : EPOCH 1 - PROGRESS: at 55.01% examples, 60715 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:21:57,010 : INFO : EPOCH 1 - PROGRESS: at 55.36% examples, 60637 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:21:58,156 : INFO : EPOCH 1 - PROGRESS: at 55.83% examples, 60605 words/s, in_qsize 0, out_qsize 1\n",
      "2022-04-21 13:21:59,228 : INFO : EPOCH 1 - PROGRESS: at 56.39% examples, 60715 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:00,250 : INFO : EPOCH 1 - PROGRESS: at 56.86% examples, 60737 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:01,430 : INFO : EPOCH 1 - PROGRESS: at 57.31% examples, 60688 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:02,494 : INFO : EPOCH 1 - PROGRESS: at 57.72% examples, 60641 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:03,584 : INFO : EPOCH 1 - PROGRESS: at 58.13% examples, 60586 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:04,864 : INFO : EPOCH 1 - PROGRESS: at 58.64% examples, 60557 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:05,900 : INFO : EPOCH 1 - PROGRESS: at 59.15% examples, 60636 words/s, in_qsize 1, out_qsize 0\n",
      "2022-04-21 13:22:07,184 : INFO : EPOCH 1 - PROGRESS: at 59.49% examples, 60452 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:08,217 : INFO : EPOCH 1 - PROGRESS: at 59.94% examples, 60487 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:09,459 : INFO : EPOCH 1 - PROGRESS: at 60.44% examples, 60487 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:10,706 : INFO : EPOCH 1 - PROGRESS: at 60.94% examples, 60483 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:11,707 : INFO : EPOCH 1 - PROGRESS: at 61.35% examples, 60474 words/s, in_qsize 0, out_qsize 1\n",
      "2022-04-21 13:22:12,909 : INFO : EPOCH 1 - PROGRESS: at 61.84% examples, 60484 words/s, in_qsize 0, out_qsize 1\n",
      "2022-04-21 13:22:13,992 : INFO : EPOCH 1 - PROGRESS: at 62.35% examples, 60539 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:15,055 : INFO : EPOCH 1 - PROGRESS: at 62.81% examples, 60552 words/s, in_qsize 0, out_qsize 1\n",
      "2022-04-21 13:22:16,070 : INFO : EPOCH 1 - PROGRESS: at 63.22% examples, 60535 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:17,116 : INFO : EPOCH 1 - PROGRESS: at 63.77% examples, 60641 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:18,256 : INFO : EPOCH 1 - PROGRESS: at 64.23% examples, 60616 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:19,393 : INFO : EPOCH 1 - PROGRESS: at 64.74% examples, 60642 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:20,446 : INFO : EPOCH 1 - PROGRESS: at 65.23% examples, 60703 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:21,533 : INFO : EPOCH 1 - PROGRESS: at 65.69% examples, 60700 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:22,721 : INFO : EPOCH 1 - PROGRESS: at 66.20% examples, 60707 words/s, in_qsize 0, out_qsize 1\n",
      "2022-04-21 13:22:23,790 : INFO : EPOCH 1 - PROGRESS: at 66.66% examples, 60716 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:24,839 : INFO : EPOCH 1 - PROGRESS: at 67.07% examples, 60686 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:25,898 : INFO : EPOCH 1 - PROGRESS: at 67.64% examples, 60792 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:26,976 : INFO : EPOCH 1 - PROGRESS: at 68.20% examples, 60885 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:28,181 : INFO : EPOCH 1 - PROGRESS: at 68.66% examples, 60835 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:29,286 : INFO : EPOCH 1 - PROGRESS: at 69.17% examples, 60867 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:30,328 : INFO : EPOCH 1 - PROGRESS: at 69.58% examples, 60835 words/s, in_qsize 0, out_qsize 2\n",
      "2022-04-21 13:22:31,384 : INFO : EPOCH 1 - PROGRESS: at 70.09% examples, 60886 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:32,482 : INFO : EPOCH 1 - PROGRESS: at 70.54% examples, 60878 words/s, in_qsize 1, out_qsize 3\n",
      "2022-04-21 13:22:33,521 : INFO : EPOCH 1 - PROGRESS: at 71.05% examples, 60934 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:34,587 : INFO : EPOCH 1 - PROGRESS: at 71.65% examples, 61065 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:35,671 : INFO : EPOCH 1 - PROGRESS: at 72.06% examples, 61019 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:36,989 : INFO : EPOCH 1 - PROGRESS: at 72.62% examples, 61020 words/s, in_qsize 0, out_qsize 2\n",
      "2022-04-21 13:22:38,168 : INFO : EPOCH 1 - PROGRESS: at 73.18% examples, 61071 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:39,378 : INFO : EPOCH 1 - PROGRESS: at 73.68% examples, 61065 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:40,456 : INFO : EPOCH 1 - PROGRESS: at 74.24% examples, 61148 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:41,479 : INFO : EPOCH 1 - PROGRESS: at 74.64% examples, 61126 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:42,625 : INFO : EPOCH 1 - PROGRESS: at 75.15% examples, 61143 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:43,657 : INFO : EPOCH 1 - PROGRESS: at 75.56% examples, 61119 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:44,725 : INFO : EPOCH 1 - PROGRESS: at 76.02% examples, 61123 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:45,786 : INFO : EPOCH 1 - PROGRESS: at 76.33% examples, 61006 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:46,910 : INFO : EPOCH 1 - PROGRESS: at 76.84% examples, 61032 words/s, in_qsize 0, out_qsize 1\n",
      "2022-04-21 13:22:47,978 : INFO : EPOCH 1 - PROGRESS: at 77.35% examples, 61075 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:49,021 : INFO : EPOCH 1 - PROGRESS: at 77.76% examples, 61048 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:50,186 : INFO : EPOCH 1 - PROGRESS: at 78.17% examples, 60986 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:51,341 : INFO : EPOCH 1 - PROGRESS: at 78.72% examples, 61041 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:52,432 : INFO : EPOCH 1 - PROGRESS: at 79.13% examples, 61003 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:53,442 : INFO : EPOCH 1 - PROGRESS: at 79.59% examples, 61030 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:54,480 : INFO : EPOCH 1 - PROGRESS: at 80.00% examples, 61008 words/s, in_qsize 0, out_qsize 1\n",
      "2022-04-21 13:22:55,513 : INFO : EPOCH 1 - PROGRESS: at 80.36% examples, 60951 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:56,773 : INFO : EPOCH 1 - PROGRESS: at 80.91% examples, 60974 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:57,785 : INFO : EPOCH 1 - PROGRESS: at 81.42% examples, 61031 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:58,821 : INFO : EPOCH 1 - PROGRESS: at 81.89% examples, 61043 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:22:59,868 : INFO : EPOCH 1 - PROGRESS: at 82.34% examples, 61048 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:23:00,905 : INFO : EPOCH 1 - PROGRESS: at 83.06% examples, 61243 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:23:02,056 : INFO : EPOCH 1 - PROGRESS: at 83.42% examples, 61142 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:23:03,118 : INFO : EPOCH 1 - PROGRESS: at 83.88% examples, 61148 words/s, in_qsize 0, out_qsize 3\n",
      "2022-04-21 13:23:04,220 : INFO : EPOCH 1 - PROGRESS: at 84.44% examples, 61215 words/s, in_qsize 1, out_qsize 0\n",
      "2022-04-21 13:23:05,312 : INFO : EPOCH 1 - PROGRESS: at 84.96% examples, 61245 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:23:06,452 : INFO : EPOCH 1 - PROGRESS: at 85.42% examples, 61222 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:23:07,473 : INFO : EPOCH 1 - PROGRESS: at 85.88% examples, 61236 words/s, in_qsize 0, out_qsize 1\n",
      "2022-04-21 13:23:08,496 : INFO : EPOCH 1 - PROGRESS: at 86.30% examples, 61213 words/s, in_qsize 0, out_qsize 2\n",
      "2022-04-21 13:23:09,497 : INFO : EPOCH 1 - PROGRESS: at 86.81% examples, 61269 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:23:10,498 : INFO : EPOCH 1 - PROGRESS: at 87.16% examples, 61220 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:23:11,512 : INFO : EPOCH 1 - PROGRESS: at 87.73% examples, 61311 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:23:12,592 : INFO : EPOCH 1 - PROGRESS: at 88.13% examples, 61271 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:23:13,654 : INFO : EPOCH 1 - PROGRESS: at 88.55% examples, 61239 words/s, in_qsize 0, out_qsize 1\n",
      "2022-04-21 13:23:14,830 : INFO : EPOCH 1 - PROGRESS: at 89.07% examples, 61246 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:23:15,919 : INFO : EPOCH 1 - PROGRESS: at 89.53% examples, 61240 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:23:16,977 : INFO : EPOCH 1 - PROGRESS: at 89.99% examples, 61244 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:23:18,078 : INFO : EPOCH 1 - PROGRESS: at 90.34% examples, 61172 words/s, in_qsize 1, out_qsize 2\n",
      "2022-04-21 13:23:19,103 : INFO : EPOCH 1 - PROGRESS: at 90.81% examples, 61187 words/s, in_qsize 0, out_qsize 1\n",
      "2022-04-21 13:23:20,206 : INFO : EPOCH 1 - PROGRESS: at 91.38% examples, 61245 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:23:21,263 : INFO : EPOCH 1 - PROGRESS: at 91.79% examples, 61215 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:23:22,297 : INFO : EPOCH 1 - PROGRESS: at 92.34% examples, 61295 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:23:23,332 : INFO : EPOCH 1 - PROGRESS: at 92.70% examples, 61239 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:23:24,342 : INFO : EPOCH 1 - PROGRESS: at 93.16% examples, 61255 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:23:25,596 : INFO : EPOCH 1 - PROGRESS: at 93.56% examples, 61170 words/s, in_qsize 0, out_qsize 3\n",
      "2022-04-21 13:23:26,808 : INFO : EPOCH 1 - PROGRESS: at 94.18% examples, 61233 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:23:27,854 : INFO : EPOCH 1 - PROGRESS: at 94.52% examples, 61181 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:23:28,940 : INFO : EPOCH 1 - PROGRESS: at 95.03% examples, 61218 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:23:30,141 : INFO : EPOCH 1 - PROGRESS: at 95.49% examples, 61188 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:23:31,331 : INFO : EPOCH 1 - PROGRESS: at 96.09% examples, 61260 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:23:32,421 : INFO : EPOCH 1 - PROGRESS: at 96.49% examples, 61225 words/s, in_qsize 0, out_qsize 1\n",
      "2022-04-21 13:23:33,576 : INFO : EPOCH 1 - PROGRESS: at 97.01% examples, 61235 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:23:34,845 : INFO : EPOCH 1 - PROGRESS: at 97.52% examples, 61213 words/s, in_qsize 0, out_qsize 3\n",
      "2022-04-21 13:23:35,972 : INFO : EPOCH 1 - PROGRESS: at 98.09% examples, 61261 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:23:37,019 : INFO : EPOCH 1 - PROGRESS: at 98.55% examples, 61268 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:23:38,044 : INFO : EPOCH 1 - PROGRESS: at 99.01% examples, 61283 words/s, in_qsize 0, out_qsize 0\n",
      "2022-04-21 13:23:38,134 : INFO : worker thread finished; awaiting finish of 23 more threads\n",
      "2022-04-21 13:23:38,138 : INFO : worker thread finished; awaiting finish of 22 more threads\n",
      "2022-04-21 13:23:38,140 : INFO : worker thread finished; awaiting finish of 21 more threads\n",
      "2022-04-21 13:23:38,141 : INFO : worker thread finished; awaiting finish of 20 more threads\n",
      "2022-04-21 13:23:38,142 : INFO : worker thread finished; awaiting finish of 19 more threads\n",
      "2022-04-21 13:23:38,178 : INFO : worker thread finished; awaiting finish of 18 more threads\n",
      "2022-04-21 13:23:38,374 : INFO : worker thread finished; awaiting finish of 17 more threads\n",
      "2022-04-21 13:23:38,439 : INFO : worker thread finished; awaiting finish of 16 more threads\n",
      "2022-04-21 13:23:38,643 : INFO : worker thread finished; awaiting finish of 15 more threads\n",
      "2022-04-21 13:23:38,645 : INFO : worker thread finished; awaiting finish of 14 more threads\n",
      "2022-04-21 13:23:38,799 : INFO : worker thread finished; awaiting finish of 13 more threads\n",
      "2022-04-21 13:23:38,838 : INFO : worker thread finished; awaiting finish of 12 more threads\n",
      "2022-04-21 13:23:38,903 : INFO : worker thread finished; awaiting finish of 11 more threads\n",
      "2022-04-21 13:23:39,002 : INFO : worker thread finished; awaiting finish of 10 more threads\n",
      "2022-04-21 13:23:39,075 : INFO : EPOCH 1 - PROGRESS: at 99.57% examples, 61364 words/s, in_qsize 9, out_qsize 1\n",
      "2022-04-21 13:23:39,077 : INFO : worker thread finished; awaiting finish of 9 more threads\n",
      "2022-04-21 13:23:39,105 : INFO : worker thread finished; awaiting finish of 8 more threads\n",
      "2022-04-21 13:23:39,190 : INFO : worker thread finished; awaiting finish of 7 more threads\n",
      "2022-04-21 13:23:39,244 : INFO : worker thread finished; awaiting finish of 6 more threads\n",
      "2022-04-21 13:23:39,287 : INFO : worker thread finished; awaiting finish of 5 more threads\n",
      "2022-04-21 13:23:39,304 : INFO : worker thread finished; awaiting finish of 4 more threads\n",
      "2022-04-21 13:23:39,314 : INFO : worker thread finished; awaiting finish of 3 more threads\n",
      "2022-04-21 13:23:39,336 : INFO : worker thread finished; awaiting finish of 2 more threads\n",
      "2022-04-21 13:23:39,348 : INFO : worker thread finished; awaiting finish of 1 more threads\n",
      "2022-04-21 13:23:39,358 : INFO : worker thread finished; awaiting finish of 0 more threads\n",
      "2022-04-21 13:23:39,360 : INFO : EPOCH - 1 : training on 19563557 raw words (14348299 effective words) took 233.0s, 61571 effective words/s\n",
      "2022-04-21 13:23:39,362 : INFO : Word2Vec lifecycle event {'msg': 'training on 19563557 raw words (14348299 effective words) took 233.0s, 61568 effective words/s', 'datetime': '2022-04-21T13:23:39.362430', 'gensim': '4.1.2', 'python': '3.8.6 | packaged by conda-forge | (default, Dec 26 2020, 05:05:16) \\n[GCC 9.3.0]', 'platform': 'Linux-4.19.0-10-amd64-x86_64-with-glibc2.10', 'event': 'train'}\n",
      "2022-04-21 13:23:39,363 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec(vocab=189609, vector_size=100, alpha=0.01)', 'datetime': '2022-04-21T13:23:39.363045', 'gensim': '4.1.2', 'python': '3.8.6 | packaged by conda-forge | (default, Dec 26 2020, 05:05:16) \\n[GCC 9.3.0]', 'platform': 'Linux-4.19.0-10-amd64-x86_64-with-glibc2.10', 'event': 'created'}\n"
     ]
    }
   ],
   "source": [
    "from gensim.models import word2vec\n",
    "\n",
    "# Route gensim's INFO-level progress messages to the cell output.\n",
    "logging.basicConfig(\n",
    "    format='%(asctime)s : %(levelname)s : %(message)s',\n",
    "    level=logging.INFO,\n",
    ")\n",
    "\n",
    "# Build the vocabulary and train in one call; hyperparameters come from word2vec_params.\n",
    "corpus_path = file_path + \"composition_seg.txt\"\n",
    "sentences = word2vec.LineSentence(corpus_path)\n",
    "model = word2vec.Word2Vec(sentences=sentences, **word2vec_params)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 201
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 16586,
     "status": "ok",
     "timestamp": 1562683101222,
     "user": {
      "displayName": "F74051297王彥霖",
      "photoUrl": "",
      "userId": "11026930765053722807"
     },
     "user_tz": -480
    },
    "id": "UJQgB7hUKR8E",
    "outputId": "75e0dfd4-8778-47b7-8f29-2008811eb1d1"
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2022-04-21 13:23:39,372 : INFO : Word2Vec lifecycle event {'fname_or_handle': 'composition_mincount_1_305000_vec_original.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2022-04-21T13:23:39.372870', 'gensim': '4.1.2', 'python': '3.8.6 | packaged by conda-forge | (default, Dec 26 2020, 05:05:16) \\n[GCC 9.3.0]', 'platform': 'Linux-4.19.0-10-amd64-x86_64-with-glibc2.10', 'event': 'saving'}\n",
      "2022-04-21 13:23:39,375 : INFO : storing np array 'vectors' to composition_mincount_1_305000_vec_original.model.wv.vectors.npy\n",
      "2022-04-21 13:23:39,439 : INFO : storing np array 'syn1' to composition_mincount_1_305000_vec_original.model.syn1.npy\n",
      "2022-04-21 13:23:39,501 : INFO : not storing attribute cum_table\n",
      "2022-04-21 13:23:41,324 : INFO : saved composition_mincount_1_305000_vec_original.model\n",
      "2022-04-21 13:23:41,540 : INFO : storing 189609x100 projection weights into composition_mincount_1_305000_vec_original.txt\n"
     ]
    }
   ],
   "source": [
    "# Write the full model next to the text vectors under file_path so both\n",
    "# artifacts stay together instead of the model landing in the working directory.\n",
    "model_out = file_path+'composition_mincount_1_305000_vec_original.model'\n",
    "model.save(model_out)\n",
    "\n",
    "# Plain-text (non-binary) word2vec format of the same vectors.\n",
    "out = file_path+'composition_mincount_1_305000_vec_original.txt'\n",
    "model.wv.save_word2vec_format(out, binary=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 274
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 45,
     "status": "ok",
     "timestamp": 1562683101224,
     "user": {
      "displayName": "F74051297王彥霖",
      "photoUrl": "",
      "userId": "11026930765053722807"
     },
     "user_tz": -480
    },
    "id": "YxnZqMS4Es5z",
    "outputId": "66164cbb-19db-4734-bc14-37b6440562ab"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('炎热', 0.8152037858963013),\n",
       " ('骄阳似火', 0.7880606055259705),\n",
       " ('烈日炎炎', 0.7584226727485657),\n",
       " ('夏季', 0.7223962545394897),\n",
       " ('夏日', 0.6854434609413147),\n",
       " ('酷热', 0.685324490070343),\n",
       " ('季节', 0.6601651906967163),\n",
       " ('冬天', 0.6582432985305786),\n",
       " ('盛夏', 0.6565781235694885),\n",
       " ('秋天', 0.6393560171127319)]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Spot-check the embeddings: the ten nearest neighbours of \"夏天\" (summer).\n",
    "query_word = \"夏天\"\n",
    "res = model.wv.most_similar(query_word, topn=10)\n",
    "res"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 35
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 17,
     "status": "ok",
     "timestamp": 1562683101229,
     "user": {
      "displayName": "F74051297王彥霖",
      "photoUrl": "",
      "userId": "11026930765053722807"
     },
     "user_tz": -480
    },
    "id": "AkBKXrsH3P_h",
    "outputId": "07b41859-6ef0-4009-e32d-d0e861f6fb64"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "189609"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of distinct tokens in the trained vocabulary.\n",
    "vocab_size = len(model.wv.index_to_key)\n",
    "vocab_size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "oUR8xGB9FcKS"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/opt/conda/bin/python3\n"
     ]
    }
   ],
   "source": [
    "!which python3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "name": "Word2Vec.ipynb",
   "provenance": [],
   "version": "0.3.2"
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
